# Project setup: ProjectTemplate's load.project() runs the project's data
# loading / munging scripts and attaches the libraries declared in its config.
# NOTE(review): hard-coded absolute setwd() makes this non-portable -- consider
# running from the project root (or using here::here()) instead; confirm before changing.
setwd("/Users/TARDIS/Documents/STUDIES/context_word_seg")
library(ProjectTemplate)
load.project()

General descriptives

How many utterances in each context?

######
# WL #
######
# For the word-list analysis only, collapse every context score to a binary
# 1/0 indicator (smoothing over the 1.5's introduced by expand_windows).
# NOTE(review): mutate_each()/funs() are deprecated in dplyr >= 0.7 in favor of
# mutate(across(...)); kept as-is in case the project pins an older dplyr.
WL <- df_WL %>% 
  mutate_each(funs(ifelse(. > 0, 1, 0)), -utt, -orth, -phon)

# Per-context utterance counts. Compute the column sums once instead of twice
# (the original evaluated the same colSums(dplyr::select(...)) expression for
# both the names and the values).
WL_counts <- colSums(dplyr::select(WL, -utt, -orth, -phon))
WL_sum <- data.frame(context = names(WL_counts), n = WL_counts) %>% 
  mutate(method="word lists")

# Table: number of utterances tagged with each word-list context.
WL_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
body_touch 1530
meal 942
fussing 741
bath 664
bed 570
play 528
diaper_dressing 471
media 20
######
# HJ #
######
# Per-context utterance counts for the human coder judgments (HJ).
# Compute the column sums once instead of twice.
HJ_counts <- colSums(dplyr::select(df_HJ_bin, -utt, -orth, -phon), na.rm=TRUE)
HJ_sum <- data.frame(context = names(HJ_counts), n = HJ_counts) %>% 
  mutate(method="coder judgments")

# Table: number of utterances tagged with each coder-judgment context.
HJ_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
playtime 4780
interaction 4485
fussing 2706
mealtime 1832
bathtime 1230
none 941
diaper change 672
dressing 441
sleep 397
housework 291
touching 283
taking pictures 122
hiccups 80
TV 39
outside 36
sick 19
coughing 15
stretching 13
sneezing 12
swimming 12
music 10
drooling 4
blowing bubbles 2
friends 2
pre-doctors visit 1
######
# TM #
######
# Per-topic utterance counts for the LDA topic model.
# Compute the column sums once instead of twice.
LDA_counts <- colSums(dplyr::select(df_LDA_bin, -utt, -orth, -phon), na.rm=TRUE)
LDA_sum <- data.frame(context = names(LDA_counts), n = LDA_counts) %>% 
  mutate(method="LDA")

# Table: number of utterances assigned to each LDA topic.
LDA_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
topic_11 2292
topic_5 1887
topic_3 1616
topic_4 1544
topic_2 1478
topic_7 1251
topic_6 1187
topic_9 1176
topic_8 1090
topic_12 1020
topic_1 736
topic_10 613
# Per-topic utterance counts for the structural topic model (STM).
# Compute the column sums once instead of twice.
STM_counts <- colSums(dplyr::select(df_STM_bin, -utt, -orth, -phon), na.rm=TRUE)
STM_sum <- data.frame(context = names(STM_counts), n = STM_counts) %>% 
  mutate(method="STM")

# Table: number of utterances assigned to each STM topic.
STM_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
topic_3 2174
topic_10 1938
topic_4 1775
topic_7 1733
topic_1 1575
topic_2 1392
topic_8 1066
topic_6 930
topic_9 845
topic_5 840
topic_12 548
topic_11 540
# Combine the per-context counts from all four methods into one long frame.
all_sum <- rbind(WL_sum, HJ_sum, LDA_sum, STM_sum)

# Bar chart of context sizes, one panel per method, contexts ordered by size.
# The dashed horizontal line marks the total corpus size (nrow(df)); contexts
# can overlap, so counts within a method may sum to more than the corpus.
ggplot(all_sum, aes(y=n, x=reorder(as.factor(context), n))) + 
  geom_bar(stat = 'identity', show.legend = FALSE) + 
  geom_hline(aes(yintercept=nrow(df)), lty=2) + 
  facet_wrap(~method, scales = "free_x", ncol=1) + 
  ggtitle("Number of utterances in each context\ndashed line shows total corpus size")

How many contexts per utterance?

######
# WL #
######
# Number of contexts tagged on each utterance (the WL columns are 0/1 flags).
# na.rm = TRUE added for consistency with the HJ/LDA/STM sections below.
WL$N.contexts <- rowSums(dplyr::select(WL, -utt, -orth, -phon), na.rm = TRUE)
table(WL$N.contexts)
## 
##    0    1    2    3 
## 8334 3924  693   52
WL <- extract_contexts(WL)
summary(WL$context) # the number of utterances in each context
##       ambiguous            bath             bed      body_touch 
##             745             440             377            1147 
## diaper_dressing         fussing            meal           media 
##             295             557             706              12 
##  no context tag            play 
##            8334             390
# mean()/sum() on logical vectors replace the noisier length(which(...)) idiom;
# N.contexts contains no NAs (rowSums with na.rm = TRUE always returns a number).
mean(WL$N.contexts > 2) # percent of corpus with > 2 contexts
## [1] 0.003999077
mean(WL$N.contexts == 0) # percent of corpus with 0 contexts
## [1] 0.640929
sum(WL$N.contexts > 2) / sum(WL$N.contexts > 0) # of tagged utterances, proportion with > 2 contexts
## [1] 0.01113729
sum(WL$N.contexts > 1) / sum(WL$N.contexts > 0) # of tagged utterances, proportion with > 1 context
## [1] 0.1595631
######
# HJ #
######
# Number of contexts assigned to each utterance by the human coders.
df_HJ_bin$N.contexts <- rowSums(dplyr::select(df_HJ_bin, -utt, -orth, -phon), na.rm = TRUE)
table(df_HJ_bin$N.contexts)
## 
##    0    1    2    3 
##  501 6911 5259  332
# Collapse the binary context columns into a single context factor
# (presumably multi-context utterances become 'ambiguous' and untagged ones
# 'no context tag' -- confirm against extract_contexts()).
df_HJ_bin <- extract_contexts(df_HJ_bin)
summary(df_HJ_bin$context) # the number of utterances in each context
##       ambiguous        bathtime   diaper change        dressing 
##            5591             877             292             148 
##         fussing         hiccups       housework     interaction 
##             794              18              99            1335 
##        mealtime  no context tag            none         outside 
##             772             501             171               4 
##        playtime           sleep        sneezing taking pictures 
##            2216             124               2              14 
##        touching              TV 
##              26              19
length(which(df_HJ_bin$N.contexts > 2)) / length(df_HJ_bin$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.02553257
length(which(df_HJ_bin$N.contexts == 0)) / length(df_HJ_bin$N.contexts) # percent of corpus with 0 contexts
## [1] 0.03852957
######
# TM #
######
# Number of topics assigned to each utterance under LDA.
df_LDA_bin$N.contexts <- rowSums(dplyr::select(df_LDA_bin, -utt, -orth, -phon), na.rm = TRUE)
table(df_LDA_bin$N.contexts)
## 
##    0    1    2    3 
##   31 9100 3350   30
# Collapse the binary topic columns into a single context factor
# (presumably multi-topic utterances become 'ambiguous' and untagged ones
# 'no context tag' -- confirm against extract_contexts()).
df_LDA_bin <- extract_contexts(df_LDA_bin)
summary(df_LDA_bin$context) # the number of utterances in each context
##      ambiguous no context tag        topic_1       topic_10       topic_11 
##           3380             31            286            300           1440 
##       topic_12        topic_2        topic_3        topic_4        topic_5 
##            570            968            610           1065           1137 
##        topic_6        topic_7        topic_8        topic_9 
##            679            837            758            450
length(which(df_LDA_bin$N.contexts > 2)) / length(df_LDA_bin$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.00239789
length(which(df_LDA_bin$N.contexts == 0)) / length(df_LDA_bin$N.contexts) # percent of corpus with 0 contexts
## [1] 0.00247782
# Number of topics assigned to each utterance under the STM.
df_STM_bin$N.contexts <- rowSums(dplyr::select(df_STM_bin, -utt, -orth, -phon), na.rm = TRUE)
table(df_STM_bin$N.contexts)
## 
##    0    1    2    3 
##  213 9300 2938   60
# Collapse the binary topic columns into a single context factor
# (presumably multi-topic utterances become 'ambiguous' and untagged ones
# 'no context tag' -- confirm against extract_contexts()).
df_STM_bin <- extract_contexts(df_STM_bin)
summary(df_STM_bin$context) # the number of utterances in each context
##      ambiguous no context tag        topic_1       topic_10       topic_11 
##           2998            213            621            936            390 
##       topic_12        topic_2        topic_3        topic_4        topic_5 
##            428            750           1432           1178            540 
##        topic_6        topic_7        topic_8        topic_9 
##            540           1256            766            463
length(which(df_STM_bin$N.contexts > 2)) / length(df_STM_bin$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.00479578
length(which(df_STM_bin$N.contexts == 0)) / length(df_STM_bin$N.contexts) # percent of corpus with 0 contexts
## [1] 0.01702502
###########
# OVERALL #
###########
# One row per utterance per method: utterance id, number of contexts tagged,
# and the extracted context label, labelled with the method that produced it.
wl_contexts <- WL %>% 
  dplyr::select(utt, N.contexts, context) %>% 
  mutate(method = "word list")
hj_contexts <- df_HJ_bin %>% 
  dplyr::select(utt, N.contexts, context) %>% 
  mutate(method = "coder judgments")
lda_contexts <- df_LDA_bin %>% 
  dplyr::select(utt, N.contexts, context) %>% 
  mutate(method = "LDA")
stm_contexts <- df_STM_bin %>% 
  dplyr::select(utt, N.contexts, context) %>% 
  mutate(method = "STM")

# Count utterances by method and by how many contexts each received.
all_methods_counts <- wl_contexts %>% 
  rbind(hj_contexts, lda_contexts, stm_contexts) %>% 
  count(method, N.contexts)

# Stacked bars: distribution of contexts-per-utterance under each method.
ggplot(all_methods_counts, aes(y = n, x = method, fill = as.factor(N.contexts))) + 
  geom_bar(stat = "identity") + 
  labs(title = "Number of contexts tagged per utterance", y="Number of utterances", x = "Context defined by") + 
  scale_fill_discrete(name="Number of contexts")

# Re-derive the coder-judgment context assignments after dropping the "none"
# column, so 'none' codes no longer count as a context.
df_HJ_bin_no_none <- df_HJ_bin %>% 
  dplyr::select(-none, -N.contexts, -context)
df_HJ_bin_no_none$N.contexts <- df_HJ_bin_no_none %>% 
  dplyr::select(-utt, -orth, -phon) %>% 
  rowSums(na.rm = TRUE)
df_HJ_bin_no_none <- extract_contexts(df_HJ_bin_no_none)

hj_no_none <- df_HJ_bin_no_none %>% 
  dplyr::select(utt, N.contexts, context) %>% 
  mutate(method = "coder judgments")

# Same per-method counts as above, but with the no-'none' HJ assignments.
all_methods_counts_no_none <- wl_contexts %>% 
  rbind(hj_no_none, lda_contexts, stm_contexts) %>% 
  count(method, N.contexts)

ggplot(all_methods_counts_no_none, aes(y = n, x = method, fill = as.factor(N.contexts))) + 
  geom_bar(stat = "identity") + 
  labs(title = "Number of contexts tagged per utterance,\nnot including 'none' for coder judgments", y="Number of utterances", x = "Context defined by") + 
  scale_fill_discrete(name="Number of contexts")

# Compare contexts-per-utterance with and without the 'none' code in HJ.
table(hj_contexts$N.contexts)
## 
##    0    1    2    3 
##  501 6911 5259  332
table(hj_no_none$N.contexts)
## 
##    0    1    2    3 
##  672 7429 4651  251
# Count utterances per context within each method.
all_methods_context_count <- rbind(wl_contexts, hj_contexts, lda_contexts, stm_contexts) %>% 
  count(method, context)
# Move the three "non-context" levels to the front of the factor so they pick
# up the highlighted colors in the plot below; after these three relevels the
# order is: ambiguous, no context tag, none, then all remaining contexts.
all_methods_context_count$context <- relevel(all_methods_context_count$context, "none" )
all_methods_context_count$context <- relevel(all_methods_context_count$context, "no context tag" )
all_methods_context_count$context <- relevel(all_methods_context_count$context, "ambiguous" )

# Stacked bars of context composition per method; the three non-context levels
# get distinct colors, every real context shares grey.
ggplot(all_methods_context_count, aes(y=n, x=method, fill=context)) + 
  geom_bar(stat= "identity") + 
  scale_fill_manual(values = c("#0072B2", "#D55E00", "#E69F00", rep("#999999", length(levels(all_methods_context_count$context)) - 3)))

Printing context files for sharing with CF:

# Reshape the per-utterance context assignments for sharing with CF:
# parse the utterance id (e.g. "ab12.cha_345") into child, age, and utterance
# number, then write one wide CSV per method (utterance numbers as columns).
all <- rbind(wl_contexts, hj_no_none, lda_contexts, stm_contexts) %>% 
  dplyr::select(utt, context, method) %>% 
  tidyr::extract(utt, into = c("child", "age", "utt.num"), regex = "^([[:alpha:]]{2})([[:digit:]]{2})[.]cha_([[:digit:]]+)$")

all$utt.num <- as.numeric(all$utt.num)

# One output path per method; the loop replaces four near-identical
# copy-pasted filter/spread/write blocks.
# NOTE(review): absolute user-specific paths; consider file.path() from a
# configurable root for portability.
context_files <- c(
  "word list"       = "/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_WL.csv",
  "coder judgments" = "/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_HJ.csv",
  "LDA"             = "/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_LDA.csv",
  "STM"             = "/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_STM.csv"
)
for (m in names(context_files)) {
  all %>% 
    dplyr::filter(method == m) %>% 
    tidyr::spread(key = utt.num, value = context) %>% 
    dplyr::select(-method) %>% 
    write.csv(context_files[[m]], row.names = FALSE)
}

Select context files to use

Dropping “none” codes from HJ.

# Final context assignments used downstream. WL is already in its final form
# (the original had a redundant self-assignment `WL <- WL` here, now removed).
HJ <- df_HJ_bin_no_none  # coder judgments with 'none' codes dropped
LDA <- df_LDA_bin 
STM <- df_STM_bin 

General thoughts

The word list and coder definitions of context naturally produce a skewed distribution of activities, whereas the topic modeling approaches discover a more uniform distribution of activities. To the extent that the distribution of activities is naturally skewed (e.g. that a few activities happen very often, and many others happen rarely), topic modeling approaches to identifying activity context may distort reality.

Word list descriptives

Frequency of WL key words?

# Long data frame of (keyword, context) pairs from the word lists.
# Build all pieces first and bind once, instead of growing the data frame
# with rbind() inside a loop (quadratic copying).
contexts <- names(WL_contexts)
WL_context_data <- do.call(rbind, lapply(contexts, function(k) {
  data.frame(word = WL_contexts[[k]], context = k, stringsAsFactors = FALSE)
}))

# Collapse the whole orthographic corpus into one space-separated token stream.
orth_stream <- paste(df$orth, collapse = " ")
# Flag multiword (underscore-joined) keywords in the orth stream so they don't
# get separated when splitting on spaces. fixed = TRUE treats the keyword as a
# literal string rather than a regex -- the original could misfire if a keyword
# contained a regex metacharacter.
for (w_bar in grep(x = WL_context_data$word, pattern = "_", value = TRUE)) {
  w_space <- gsub(x = w_bar, pattern = "_", replacement = " ", fixed = TRUE)
  orth_stream <- gsub(x = orth_stream, pattern = w_space, replacement = w_bar, fixed = TRUE)
}

orth_stream <- strsplit(orth_stream, split = " ")[[1]]
orth_stream <- orth_stream[orth_stream != ""]  # drop empty tokens from repeated spaces

# Corpus frequency of every token.
orth_data <- data.frame(word = orth_stream, stringsAsFactors = FALSE) %>% 
  count(word)

# Attach corpus frequency to each keyword (n is NA for keywords never observed).
WL_context_data <- left_join(WL_context_data, orth_data, by = "word") %>% 
  arrange(context, n)

# Per-context keyword statistics: total keyword occurrences, mean frequency
# per keyword, and the single most frequent keyword.
WL_context_data %>% 
  group_by(context) %>% 
  summarise(total=sum(n, na.rm=TRUE), 
            mean.freq=mean(n, na.rm=TRUE), 
            highest=max(n, na.rm=TRUE),
            # which.max skips NAs, so this picks the most frequent observed keyword
            which.highest=word[which.max(n)]) %>% 
  kable()
context total mean.freq highest which.highest
bath 287 15.105263 47 bath
bed 150 13.636364 43 tired
body_touch 550 20.370370 123 tickle
diaper_dressing 116 6.444444 50 nappie
fussing 216 19.636364 56 ssh
meal 303 8.416667 27 eat
media 4 2.000000 3 television
play 136 7.157895 65 play

HJ

What raw coder tags make up each category?

What words are associated with each category?

Do “none” contexts from HJ match utterances with 0 context codes from WL?

# Indicator for HJ utterances coded 'none'. The nested ifelse deliberately maps
# anything other than exactly 0/1 (including NA) to NA rather than trusting
# the input -- keep this defensive form.
df_HJ_none <- df_HJ_bin
df_HJ_none$HJ_none <- ifelse(df_HJ_none$none==1, 1,
                             ifelse(df_HJ_none$none==0, 0, NA))
df_HJ_none <- dplyr::select(df_HJ_none, utt, HJ_none) 

# Indicator for WL utterances with zero context tags (same defensive NA handling).
df_WL_0 <- WL
df_WL_0$WL_0 <- ifelse(df_WL_0$N.contexts == 0, 1,
                             ifelse(df_WL_0$N.contexts > 0, 0, NA))
df_WL_0 <- dplyr::select(df_WL_0, utt, WL_0) 

# match == 1 exactly when an utterance is both 'none' in HJ and untagged in WL
# (HJ_none + WL_0 == 2); NA propagates through the sum.
match <- full_join(df_HJ_none, df_WL_0, by="utt") %>% 
  mutate(match = HJ_none + WL_0)
match$match <- ifelse(match$match == 2, 1, 
                      ifelse(match$match < 2, 0, NA))
# Sanity check: the full join should neither add nor drop rows.
nrow(df_WL_0); nrow(df_HJ_none); nrow(match)
## [1] 13003
## [1] 13003
## [1] 13003
# 2x2 cross-tabulation of WL-untagged (WL_0) vs HJ-'none' (HJ_none);
# rows with NA in either indicator are excluded by xtabs.
tab <- xtabs( ~ WL_0 + HJ_none, data = match)

addmargins(tab)
##      HJ_none
## WL_0      0     1   Sum
##   0    4395   188  4583
##   1    7406   753  8159
##   Sum 11801   941 12742
# Chi-squared test of independence between the two indicators.
summary(tab)
## Call: xtabs(formula = ~WL_0 + HJ_none, data = match)
## Number of cases in table: 12742 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 112.78, df = 1, p-value = 2.41e-26
# Graphical views of the association (mosaic/assocstats presumably from the
# vcd package loaded by load.project -- confirm).
mosaic(tab)

assocplot(tab)

# Effect-size measures for the 2x2 table (phi, contingency coeff., Cramer's V).
assocstats(tab)
##                     X^2 df P(> X^2)
## Likelihood Ratio 122.91  1        0
## Pearson          112.78  1        0
## 
## Phi-Coefficient   : 0.094 
## Contingency Coeff.: 0.094 
## Cramer's V        : 0.094

What percent of the “none” context utterances in HJ method are 0 context in WL?

sum(match$match, na.rm=TRUE) / sum(match$HJ_none, na.rm=TRUE)
## [1] 0.8002125

What percent of the 0 context utterances in WL method are “none” context in HJ?

sum(match$match, na.rm=TRUE) / sum(match$WL_0, na.rm=TRUE)
## [1] 0.09035277

Topic modeling: LDA

What words are associated with each LDA topic? LDAvis: http://bl.ocks.org/rosemm/raw/a7b1ac43ffe3b49229ed5e866762613f/

# http://cpsievert.github.io/LDAvis/reviews/reviews.html
# Convert the lda-package fit into the theta/phi matrices LDAvis expects.
alpha <-  0.1 # from lda package demo
eta <-  0.1 # from lda package demo
# theta: per-document topic proportions (Dirichlet-smoothed by alpha);
# phi: per-topic word probabilities (smoothed by eta). Each row sums to 1.
theta <- t(apply(lda$document_sums + alpha, 2, function(x) x/sum(x)))
phi <- t(apply(t(lda$topics) + eta, 2, function(x) x/sum(x)))

D <- length(TM_doc_prep_out$documents)  # number of documents 
W <- length(TM_doc_prep_out$vocab)  # number of terms in the vocab
doc.length <- document.lengths(TM_doc_prep_out$documents)  # number of tokens per document
N <- sum(doc.length)  # total number of tokens in the data
term.frequency <- word.counts(TM_doc_prep_out$documents, vocab = TM_doc_prep_out$vocab)


# Bundle of the pieces LDAvis needs (kept for reference; createJSON below is
# called with the same arguments directly).
lda_data <- list(phi = phi,
                 theta = theta,
                 doc.length = doc.length,
                 vocab = TM_doc_prep_out$vocab,
                 term.frequency = as.integer(term.frequency))

# Build the LDAvis JSON payload for the interactive browser view.
json <- createJSON(phi = phi, 
                   theta = theta, 
                   doc.length = doc.length, 
                   vocab = TM_doc_prep_out$vocab, 
                   term.frequency = as.integer(term.frequency))

# Publish the interactive visualization as a GitHub gist.
serVis(json, as.gist = TRUE) 

Wordles, showing the frequencies of words in the utterances assigned to each topic.

# Word clouds of raw word frequencies in the utterances assigned to each LDA
# topic. A loop replaces twelve copy-pasted calls.
# NOTE(review): assumes cloud_from_df() draws with base graphics (renders fine
# inside a loop); if it returns a ggplot object, wrap the call in print().
for (topic in paste0("topic_", 1:12)) {
  cloud_from_df(df_LDA_bin, topic)
}

Topic modeling: STM

What words are associated with each STM topic?

# Print the STM topic summary: top words per topic, per covariate group,
# and topic-by-group interactions (output recorded below).
summary(stm)
## A topic model with 12 topics, 427 documents and a 531 word dictionary.
## Topic Words:
##  Topic 1: side, lost, dear, round, ha, said, teddi 
##  Topic 2: bop, hello, monkey, give, hi, thing, say 
##  Topic 3: shh, hey, shake, rabbit, els, chou, light 
##  Topic 4: door, lunch, minut, sit, lucki, sweetheart, watch 
##  Topic 5: tickl, toe, feet, din, tick, tum, boy 
##  Topic 6: away, one, nappi, will, wee, sorri, milk 
##  Topic 7: hair, water, bad, fun, minut, deari, drink 
##  Topic 8: bum, bath, splash, swim, shake, dad, mum 
##  Topic 9: boo, bye, ah, clever, hide, see, tea 
##  Topic 10: hand, see, naughti, fed, can, bite, look 
##  Topic 11: juic, whether, duck, quack, miss, leav, madam 
##  Topic 12: dear, got, oh, hello, shame 
##  
##  Covariate Words:
##  Group cr: grumpi, gut, gorgeous, hmm, full, fast, shall 
##  Group gi: jumper, throat, without, keep, christoph, bash, reach 
##  Group gl: precious, handi, snooz, temper, mat, slept, happi 
##  Group la: hannah, busi, usual, nosh, prove, mummi, whose 
##  Group st: mobil, oven, cradl, appl, pussycat, cupsi, shoulder 
##  
##  Topic-Covariate Interactions:
##  Topic 1, Group cr: pretti, forget, nail, whee, let, dress, tear 
##  Topic 1, Group gi: clap, hand, hiccough, might, hah, christoph, left 
##  Topic 1, Group gl: struggl, walk, lost, aw, danc, punch, chin 
##  Topic 1, Group la: dub, bib, dear, bath, someth, hurt, whee 
##  Topic 1, Group st: hmm, stuck, okay, push, myron, kick, camera 
##  
##  Topic 2, Group cr: ticki, shake, excit, bore, gillian, chou, camera 
##  Topic 2, Group gi: oop, us, stuff, bash, christoph, hello, friend 
##  Topic 2, Group gl: hmm 
##  Topic 2, Group la: pretti, skirt, seen, sunshin, kick, hello, bop 
##  Topic 2, Group st: foot, attent, grab, must, nose, matter, play 
##  
##  Topic 3, Group cr: shush, matter, pleas, dolli, gillian, ah, hey 
##  Topic 3, Group gi: littl, re, wind, kick, shake, lad, chou 
##  Topic 3, Group gl: bop, rest, talk, oop, tri, bore, bib 
##  Topic 3, Group la: polli, near, thank, oop, bubbl, treasur, ho 
##  Topic 3, Group st: food, swing, smile, myron, suppos, move, ssh 
##  
##  Topic 4, Group cr: girl, love, nose, bubbl, littl, bless 
##  Topic 4, Group gi: gonna, lad, ticklish, clean, never, mum, hous 
##  Topic 4, Group gl: first, know, ssh, stuff, wet, will, happen 
##  Topic 4, Group la: iron, arm, downstair, cuddl, god, mm, quiet 
##  Topic 4, Group st: daddi, goin, readi, burp, wind, ya, wash 
##  
##  Topic 5, Group cr: belli, soft, wee, stori, din, nose, shout 
##  Topic 5, Group gi: clever, parrot, yum, complain, sack, daisi, terribl 
##  Topic 5, Group gl: find, tum, old, hah, dinner, thumb, lie 
##  Topic 5, Group la: excit, three, beebo, two, tum, get, one 
##  Topic 5, Group st: feet, toe, boy 
##  
##  Topic 6, Group cr: sleepi, huh, head, shout, dolli, eye, chair 
##  Topic 6, Group gi: burp, suck, vest, carri, roll, bib, finger 
##  Topic 6, Group gl: hide, astra, three, ah, daisi, hiccup, chang 
##  Topic 6, Group la: sunshin, clean, wee, straight, nappi, morn, one 
##  Topic 6, Group st: charl, bib, ever, gonna, forget, tri, whee 
##  
##  Topic 7, Group cr: gonna, tell, cross, patch, shout, madam, stori 
##  Topic 7, Group gi: fatti, relax, grab, ow, walk, lift, dad 
##  Topic 7, Group gl: petal, tale, pop, pet, bath, wash, dirti 
##  Topic 7, Group la: dub, stori, tell, daddi, rub, mum, wash 
##  Topic 7, Group st: lulu, hot, bottl, wide, juic, garden, first 
##  
##  Topic 8, Group cr: bath, bum 
##  Topic 8, Group gi: arm, struggl, naughti, stick, nail, wash, let 
##  Topic 8, Group gl: stori, wee, bubbl, next, ha, aw, mess 
##  Topic 8, Group la: will, splish, splosh, tonight, whee, air, mm 
##  Topic 8, Group st: say, anyth, cuddl, myron, shake, milk, hello 
##  
##  Topic 9, Group cr: oop, cross, patch, shush, sack, re, babi 
##  Topic 9, Group gi: parrot, everyth, best, mouth, cup, teeth, hair 
##  Topic 9, Group gl: astra, myron, anoth, tomorrow, yeah, clean, one 
##  Topic 9, Group la: lambchop, sleepi, watch, chou, love, girl, tri 
##  Topic 9, Group st: hide, stand, ah, littl, bye 
##  
##  Topic 10, Group cr: toe, stretch, game, play, can, tongu, goin 
##  Topic 10, Group gi: clap, splash, shout, push, stand, soon, ya 
##  Topic 10, Group gl: pretti, pram, can, bear, afternoon, play, tell 
##  Topic 10, Group la: lambchop, chou, friend, littl, smile, hmm, daddi 
##  Topic 10, Group st: polli, lewi, pastri, joseph, rain, though, thank 
##  
##  Topic 11, Group cr: attent, ya, bib, matter, think, minut, alright 
##  Topic 11, Group gi: duck, quack, bath, will, look, left, minut 
##  Topic 11, Group gl: get, sunshin, dress, tongu, arm, stori, never 
##  Topic 11, Group la: paddi, donald, yum, quack, came, point, silli 
##  Topic 11, Group st: meringu, dub, pie, lemon, rub, joseph, cupboard 
##  
##  Topic 12, Group cr: shame, burp, fatti, hicki, mind, poor, never 
##  Topic 12, Group gi: nose, stretch, daddi, size, christoph 
##  Topic 12, Group gl: smile, daddi, dad, bless, tis, stay, girli 
##  Topic 12, Group la: ssh, trea, pie, alright, armi, darl, funni 
##  Topic 12, Group st:  
## 
# http://cpsievert.github.io/LDAvis/reviews/reviews.html
# Interactive LDAvis view of the STM topics, published as a GitHub gist.
toLDAvis(stm, TM_doc_prep_out$documents, as.gist = TRUE) # This function does not yet allow content covariates.

Wordles, showing the probability of each word given the topic.

# Word clouds of P(word | topic) for each STM topic. A loop replaces twelve
# copy-pasted calls; stm::cloud draws via the wordcloud package (base
# graphics), so it renders inside a loop.
for (topic in seq_len(12)) {
  cloud(stm, topic = topic)
}

Wordles, showing the frequencies of words in the utterances assigned to each topic.

# Word clouds of raw word frequencies in the utterances assigned to each STM
# topic. A loop replaces twelve copy-pasted calls.
# NOTE(review): assumes cloud_from_df() draws with base graphics (renders fine
# inside a loop); if it returns a ggplot object, wrap the call in print().
for (topic in paste0("topic_", 1:12)) {
  cloud_from_df(df_STM_bin, topic)
}